###################################################################
##################      BIODIVERSITY REPLICATION    ###############
###################################################################

#Code accompanying:
#Andrew Piper, "Biodiversity is not declining in fiction," Journal of Cultural Analytics (2022)

library(ggplot2)
library(reshape2)
library(segmented)
library(plyr)
library(patchwork)

################   OUTLINE   ####################
#We measure the rise/fall of biodiversity using the following measures, methods, and datasets

#Datasets
#1. Project Gutenberg (PG) described in the original paper
#2. Hathi1M described here: https://openhumanitiesdata.metajnl.com/articles/10.5334/johd.71/
#3. Stanford + Chicago (SC) described in Piper,A. Enumerations (2018)

#Methods
#A. Database lexicon matching (DB)
#B. Supervised machine learning (ML) using bookNLP

#Measures
#i.   Abundance = token frequency
#ii.  Richness = type frequency (normalized)
#iii. Diversity = type entropy (normalized)


##############    DATA SET REVIEW    ########################

######## FIG. 1 HISTOGRAMS OF WORKS BY DECADE #########

######### PG ########
setwd("~/Research/biodiversity")
pg<-read.csv("PG_metadata_bio.csv")

#transform into decade
pg$date<-as.character(pg$pub.date)
pg$decade<-pg$date
substring(pg$decade, 4, 4)<-"0"
pg$decade<-as.numeric(pg$decade)

#subset by final data
results<-read.csv("PG_Results.csv")
results<-results[results$year > 1699,]
pg<-pg[pg$id %in% gsub(".txt","", results$filename),]

#make histograms of date distribution by decade
p1<-ggplot(pg, aes(decade))+
  geom_histogram(binwidth = 10, color="black", fill="white") +
  theme_classic() +
  labs(x = "", y = "Number Works", title = "Project Gutenberg (PG)") +
  theme(plot.title = element_text(hjust = 0.5)) +
  xlim(1700, 2000)

#### STANFORD + CHICAGO  ######
setwd("~/Data")
#load data
stan<-read.csv("19C_Stanford_Novel_750_Meta.csv")
chic<-read.csv("CHICAGO_SAMPLE_1K.csv")

#shape
stan<-cbind(stan$FILENAME, stan$DATE)
chic<-cbind(chic$FILENAME, chic$PUBL_DATE)
colnames(stan)<-c("filename","date")
colnames(chic)<-c("filename","date")

#combine
sc<-data.frame(rbind(stan, chic))

#get decade count
sc$date<-as.character(sc$date)
sc$decade<-sc$date
substring(sc$decade, 4, 4)<-"0"
sc$decade<-as.numeric(sc$decade)

#make histograms of date distribution by decade
p2<-ggplot(sc, aes(decade))+
  geom_histogram(binwidth = 10, color="black", fill="white") +
  theme_classic() +
  labs(x = "", y = "", title = "Stanford+Chicago (SC)") +
  theme(plot.title = element_text(hjust = 0.5))

#### HATHI 1M ####
setwd("~/Data/Hathi1M")
hath<-read.csv("Fiction_metadata.tsv", sep="\t")
p3<-ggplot(hath, aes(year))+
  geom_histogram(binwidth = 10, color="black", fill="white") +
  theme_classic() +
  labs(x = "", y = "Number Pages", title = "Hathi1M") +
  theme(plot.title = element_text(hjust = 0.5))

p1+p2+p3+plot_layout(ncol=2)


########## GET TOKEN COUNT FOR EACH DATA SET   ###############

### PG ####
setwd("~/Data/gut_BIO")
filenames<-list.files()
count.pg<-NULL
for (i in 1:length(filenames)){
  print(i)
  b<-scan(filenames[i], what="character", quote="", quiet=T)
  c<-unlist(strsplit(b, '\\s+|(?<=[a-z])(?=[[:punct:]])', perl = TRUE))
  fileID<-filenames[i]
  count<-length(c)
  temp.df<-data.frame(fileID, count)
  count.pg<-rbind(count.pg, temp.df)
}
setwd("~/Research/biodiversity")
pg<-read.csv("PG_Results.csv")
pg<-pg[pg$year > 1699,]
count.pg2<-count.pg[gsub(".txt","",count.pg$fileID) %in% pg$filename,]
sum(count.pg2$count)
pg.meta<-read.csv("PG_metadata_bio.csv")
pg.count3<-count.pg[gsub(".txt","",count.pg$fileID) %in% pg.meta$id,]
pg.count4<-pg.count3[pg.count3$count > 15000,]

### SC ####
setwd("~/Data/bio_StanChicago")
filenames<-list.files()
count<-vector()
for (i in 1:length(filenames)){
  print(i)
  b<-read.csv(filenames[i], sep="\t", quote="")
  count<-append(count, nrow(b))
}

### HATHI1M ####
h1<-read.csv("Enriched_Feature_Set_Fiction.csv")
h2<-read.csv("Enriched_Feature_Set_NONfiction.csv")

sum(h1$TotalWords)+sum(h2$TotalWords)

########### FIG. 2 RANKED WORD LISTS BY METHOD  ##############

#Here we show ranked lists of taxa based on the two methods
#This uses the Stanford+Chicago data
setwd("~/Research/biodiversity")
ml.t<-read.csv("PG_Taxa_List_ML.csv")
db.t<-read.csv("PG_Taxa_List_DB.csv")

#reorder
ml.t$taxa.ml<-factor(ml.t$taxa.ml, levels = ml.t$taxa.ml[order(ml.t$Freq)])
#ggplot(nlp.t[1:30,], aes(x=taxa.ml, y=Freq, fill=Freq))+
p1<-ggplot(ml.t[1:30,], aes(x=taxa.ml, y=Freq))+
  geom_bar(stat="identity", show.legend = FALSE) +
  coord_flip() +
  theme_classic() +
  #scale_fill_gradient(low='grey', high='black') +
  labs(y = "Frequency", x = "", title = "ML Method")
  
db.t$taxa.db<-factor(db.t$taxa.db, levels = db.t$taxa.db[order(db.t$Freq)])
#ggplot(db.t[1:30,], aes(x=taxa.db, y=Freq, fill=Freq))+
p2<-ggplot(db.t[1:30,], aes(x=taxa.db, y=Freq))+
  geom_bar(stat="identity", show.legend = FALSE) +
  coord_flip() +
  theme_classic() +
  #scale_fill_gradient(low='grey', high='black') +
  labs(y = "Frequency", x = "", title = "DB Method")

#make single figure
p1 + p2 + plot_layout(ncol = 2)


##############   METHOD VALIDATION   ########################
#We validate the two measures by hand-annotating a small sample of passages.
#Due to the rarity of the event, we sample 100 50-word passages that begin with a sentence where a taxon has been predicted.
#Thus all of our samples have at least one biological taxon in them along with numerous other kinds of entities.
#this will bias our validation in favor of a method that overcounts (it has a better chance of being right than out in the wild)

#ingest validation table
setwd("~/Research/biodiversity")
val<-read.csv("Validation_Bio.csv")

#calculate F1 for each method

#ML
ml.pos<-val[grep("noun.animal|noun.plant", val$supersense),]
ml.neg<-val[-grep("noun.animal|noun.plant", val$supersense),]
  
tp<-ml.pos[which(ml.pos$bio == "x"),]
fp<-ml.pos[which(ml.pos$bio != "x"),]
tn<-ml.neg[which(ml.neg$bio != "x"),]
fn<-ml.neg[which(ml.neg$bio == "x"),]

prec<-nrow(tp)/(nrow(tp)+nrow(fp))
rec<-nrow(tp)/(nrow(tp)+nrow(fn))
spec<-nrow(tn)/(nrow(tn)+nrow(fp))
f1.ml<-2*((prec*rec)/(prec+rec))

#DB
#ingest taxon list
tax<-read.csv("TaxonDB_english.csv")
bl<-read.csv("Blacklist.txt", header=F)
#remove blacklist
tax<-tax[which(!tax$Term %in% bl$V1),]

#add code for rows
val$code<-seq(1:nrow(val))

#remove stopwords
#val.s<-val[which(!val$lemma %in% stopwords("en")),]

#optional: remove non-nouns
val.s<-val.s[val.s$pos %in% c("NN", "NNS"),]

db.pos<-val.s[which(val.s$lemma %in% tax$Term),]
#db.neg<-val[which(!val$code %in% db.pos$code),]
db.neg<-val.s[which(!val.s$lemma %in% tax$Term),]

tp<-db.pos[which(db.pos$bio == "x"),]
fp<-db.pos[which(db.pos$bio != "x"),]
tn<-db.neg[which(db.neg$bio != "x"),]
fn<-db.neg[which(db.neg$bio == "x"),]

prec<-nrow(tp)/(nrow(tp)+nrow(fp))
rec<-nrow(tp)/(nrow(tp)+nrow(fn))
spec<-nrow(tn)/(nrow(tn)+nrow(fp))
f1.db<-2*((prec*rec)/(prec+rec))

#observe differences
db.notml<-db.pos[!db.pos$lemma %in% ml.pos$lemma,]
ml.notdb<-ml.pos[!ml.pos$lemma %in% db.pos$lemma,]


##############   ESTIMATING MEASURES   #############

########## ABUNDANCE ##############

##########     PG    ##############

setwd("~/Research/biodiversity")
a<-read.csv("PG_Results.csv")

#remove pre-1750 texts
a<-a[a$year > 1699,]

#transform into 5 year intervals
n <- 5
#save final digit as vector
five.year<-as.numeric(substr(a$year, 4, 4))
#round down to nearest 5 and append to date
five.year<-round_any(five.year, n, f = floor)
#replace column
a$year2<-as.character(a$year)
substring(a$year2,4,4)<-as.character(five.year)
a$year2<-as.numeric(a$year2)

#get 5 year average for each measure
#a.mean<-aggregate(a[,3:12],by=list(factor(a$year2)), mean)
#colnames(a.mean)[1]<-c("year")
#a.mean$year<-as.numeric(as.character(a.mean$year))

#bootstrap 5-year means
a.mean<-NULL
for (i in 1:nlevels(factor(as.character(a$year2)))){
  print(i)
  sub<-a[a$year2 == levels(factor(as.character(a$year2)))[i],]
  mean.df<-NULL
  for (j in 1:1000){
    sub2<-sub[sample(nrow(sub), nrow(sub), replace = T),]
    sub3<-t(data.frame(colMeans(sub2[,3:12])))
    mean.df<-rbind(mean.df, sub3)
  }
  mean.df<-data.frame(mean.df)
  mean.df$year<-as.numeric(levels(factor(as.character(a$year2)))[i])
  a.mean<-rbind(a.mean, t(data.frame(colMeans(mean.df))))
}
a.mean<-data.frame(a.mean)

#get correlation between two methods
cor.test(a.mean$ttr.ml, a.mean$ttr.db)
cor.test(a.mean$entropy.ml, a.mean$entropy.db)
cor.test(a.mean$freq.ml, a.mean$freq.db)

#ML
a.ml<-data.frame(a.mean$year, a.mean$freq.ml)
colnames(a.ml)<-c("year", "x")
#breakpoint analysis
ml.lm<-lm(x ~ year, data=a.ml)
ml.seg <- segmented(ml.lm, seg.Z = ~ year, psi = list(year = c(1835)))
summary(ml.seg)
ml.fitted <- fitted(ml.seg)
ml.model <- data.frame(Year = a.ml$year, Frequency = ml.fitted)
ml.lines <- round_any(ml.seg$psi[, 2], 5, f = round)
ml.text<-as.character(unname(round_any(ml.seg$psi[, 2], 5, f = round)))
ml.summary<-summary(ml.seg)
if(ml.summary$Ttable[2,4]<.001){p<-.001} else {p<-substr(ml.summary$Ttable[2,4], 1,5)}
R2<-substr(ml.summary$adj.r.squared, 1,4)
ml.p<-paste("R = ", R2," | ", "p < ", p, sep="" )

#DB
a.db<-data.frame(a.mean$year, a.mean$freq.db)
colnames(a.db)<-c("year", "x")
#breakpoint analysis
db.lm<-lm(x ~ year, data=a.db)
db.seg <- segmented(db.lm, seg.Z = ~ year, psi = list(year = c(1835)))
summary(db.seg)
db.fitted <- fitted(db.seg)
db.model <- data.frame(Year = a.db$year, Frequency = db.fitted)
db.lines <- round_any(db.seg$psi[, 2], 5, f = round)
db.text<-as.character(unname(round_any(db.seg$psi[, 2], 5, f = round)))
db.summary<-summary(db.seg)
if(db.summary$Ttable[2,4]<.001){p<-.001} else {p<-substr(db.summary$Ttable[2,4], 1,5)}
R2<-substr(db.summary$adj.r.squared, 1,4)
db.p<-paste("R = ", R2," | ", "p < ", p, sep="" )

#plot
pg.ml<-ggplot(a.ml, aes(x=year, y=x))+
  geom_point(color="black") +
  geom_line(data = ml.model, aes(x = Year, y = Frequency), size=.5) +
  theme_classic() +
  geom_vline(xintercept = ml.lines) +
  geom_text(x=ml.lines[1]+12, y=.0007, label=ml.text[1], size=3.5, check_overlap = TRUE) +
  geom_text(x=1920, y=.0007, label=ml.p, size=3.5, check_overlap = TRUE) +
  labs(x = "Year", y = "Frequency",
       title="Method = ML")

#renormalize for plotting
pg.db<-ggplot(a.db, aes(x=year, y=x))+
  geom_point(color="black") +
  geom_line(data = db.model, aes(x = Year, y = Frequency), size=.5) +
  theme_classic() +
  geom_vline(xintercept = db.lines) +
  geom_text(x=db.lines[1]+12, y=.0024, label=db.text[1], size=3.5, check_overlap = TRUE) +
  geom_text(x=1920, y=.0024, label=db.p, size=3.5, check_overlap = TRUE) +
  labs(x = "Year", y = "",
       title="Method = DB")

#combine
#sc.ml+sc.db+plot_layout(ncol = 2)+plot_annotation(title="Abundance",
#                                                  theme=theme(plot.title = element_text(hjust=0.5)))


######## PG ONLY FICTION ##########
setwd("~/Research/biodiversity")
a<-read.csv("PG_Results.csv")
#remove pre-1750 texts
a<-a[a$year > 1699,]
#transform into 5 year intervals
n <- 5
#save final digit as vector
five.year<-as.numeric(substr(a$year, 4, 4))
#round down to nearest 5 and append to date
five.year<-round_any(five.year, n, f = floor)
#replace column
a$year2<-as.character(a$year)
substring(a$year2,4,4)<-as.character(five.year)
a$year2<-as.numeric(a$year2)
pg.meta<-read.csv("PG_metadata_bio.csv")
pg.meta<-pg.meta[pg.meta$id %in% gsub(".txt","",a$filename),]

#keep only texts labeled "fiction" in "subjects" column
pg.fic<-pg.meta[grep("fiction", pg.meta$subjects, ignore.case = T),]
a.fic<-a[gsub(".txt","",a$filename) %in% pg.fic$id,]

#rerun bootstrapping

#bootstrap 5-year means
a.mean<-NULL
for (i in 1:nlevels(factor(as.character(a.fic$year2)))){
  print(i)
  sub<-a.fic[a.fic$year2 == levels(factor(as.character(a.fic$year2)))[i],]
  mean.df<-NULL
  for (j in 1:1000){
    sub2<-sub[sample(nrow(sub), nrow(sub), replace = T),]
    sub3<-t(data.frame(colMeans(sub2[,3:12])))
    mean.df<-rbind(mean.df, sub3)
  }
  mean.df<-data.frame(mean.df)
  mean.df$year<-as.numeric(levels(factor(as.character(a.fic$year2)))[i])
  a.mean<-rbind(a.mean, t(data.frame(colMeans(mean.df))))
}
a.mean<-data.frame(a.mean)

#remove outlier year
a.mean<-a.mean[-which(a.mean$freq.ml>(2*sd(a.mean$freq.ml))+mean(a.mean$freq.ml)),]

#ML
a.ml<-data.frame(a.mean$year, a.mean$freq.ml)
colnames(a.ml)<-c("year", "x")
#breakpoint analysis
ml.lm<-lm(x ~ year, data=a.ml)
ml.seg <- segmented(ml.lm, seg.Z = ~ year, psi = list(year = c(1835)))
summary(ml.seg)
ml.fitted <- fitted(ml.seg)
ml.model <- data.frame(Year = a.ml$year, Frequency = ml.fitted)
ml.lines <- round_any(ml.seg$psi[, 2], 5, f = round)
ml.text<-as.character(unname(round_any(ml.seg$psi[, 2], 5, f = round)))
ml.summary<-summary(ml.seg)
if(ml.summary$Ttable[2,4]<.001){p<-.001} else {p<-substr(ml.summary$Ttable[2,4], 1,5)}
R2<-substr(ml.summary$adj.r.squared, 1,4)
ml.p<-paste("R = ", R2," | ", "p < ", p, sep="" )

#DB
a.db<-data.frame(a.mean$year, a.mean$freq.db)
colnames(a.db)<-c("year", "x")
#breakpoint analysis
db.lm<-lm(x ~ year, data=a.db)
db.seg <- segmented(db.lm, seg.Z = ~ year, psi = list(year = c(1835)))
summary(db.seg)
db.fitted <- fitted(db.seg)
db.model <- data.frame(Year = a.db$year, Frequency = db.fitted)
db.lines <- round_any(db.seg$psi[, 2], 5, f = round)
db.text<-as.character(unname(round_any(db.seg$psi[, 2], 5, f = round)))
db.summary<-summary(db.seg)
if(db.summary$Ttable[2,4]<.001){p<-.001} else {p<-substr(db.summary$Ttable[2,4], 1,5)}
R2<-substr(db.summary$adj.r.squared, 1,4)
db.p<-paste("R = ", R2," | ", "p < ", p, sep="" )

#plot
pg.fic.ml<-ggplot(a.ml, aes(x=year, y=x))+
  geom_point(color="black") +
  geom_line(data = ml.model, aes(x = Year, y = Frequency), size=.5) +
  theme_classic() +
  geom_vline(xintercept = ml.lines) +
  geom_text(x=ml.lines[1]+12, y=.00025, label=ml.text[1], size=3.5, check_overlap = TRUE) +
  geom_text(x=1880, y=.00025, label=ml.p, size=3.5, check_overlap = TRUE) +
  labs(x = "Year", y = "Frequency")
       #title="Dataset: PG - FIC / Method = ML")

pg.fic.db<-ggplot(a.db, aes(x=year, y=x))+
  geom_point(color="black") +
  geom_line(data = db.model, aes(x = Year, y = Frequency), size=.5) +
  theme_classic() +
  geom_vline(xintercept = db.lines) +
  geom_text(x=db.lines[1]+12, y=.0016, label=db.text[1], size=3.5, check_overlap = TRUE) +
  geom_text(x=1880, y=.0016, label=db.p, size=3.5, check_overlap = TRUE) +
  labs(x = "Year", y = "")
       #title="Dataset: PG - FIC / Method = DB")

#combine
#sc.ml+sc.db+plot_layout(ncol = 2)+plot_annotation(title="Abundance",
#                                                  theme=theme(plot.title = element_text(hjust=0.5)))

###########    SC    ##############

setwd("~/Research/biodiversity")
a<-read.csv("Stan_Chicago_Results.csv")

#transform into 5 year intervals
n <- 5
#save final digit as vector
five.year<-as.numeric(substr(a$year, 4, 4))
#round down to nearest 5 and append to date
five.year<-round_any(five.year, n, f = floor)
#replace column
a$year2<-as.character(a$year)
substring(a$year2,4,4)<-as.character(five.year)
a$year2<-as.numeric(a$year2)

#get 5 year average for each measure
a.mean<-aggregate(a[,3:12],by=list(factor(a$year2)), mean)
colnames(a.mean)[1]<-c("year")
a.mean$year<-as.numeric(as.character(a.mean$year))

#get correlation between two methods
cor.test(a.mean$freq.ml, a.mean$freq.db)
cor.test(a.mean$ttr.ml, a.mean$ttr.db)
cor.test(a.mean$entropy.ml, a.mean$entropy.db)

#ML
a.ml<-data.frame(a.mean$year, a.mean$freq.ml)
colnames(a.ml)<-c("year", "x")
#breakpoint analysis
ml.lm<-lm(x ~ year, data=a.ml)
ml.seg <- segmented(ml.lm, seg.Z = ~ year, psi = list(year = c(1835)))
summary(ml.seg)
ml.fitted <- fitted(ml.seg)
ml.model <- data.frame(Year = a.ml$year, Frequency = ml.fitted)
ml.lines <- round_any(ml.seg$psi[, 2], 5, f = round)
ml.text<-as.character(unname(round_any(ml.seg$psi[, 2], 5, f = round)))
ml.summary<-summary(ml.seg)
if(ml.summary$Ttable[2,4]<.001){p<-.001} else {p<-ml.summary$Ttable[2,4]}
R2<-substr(ml.summary$adj.r.squared, 1,4)
#ml.p<-c("R = 0.64 | p < 0.001")
ml.p<-paste("R = ", R2," | ", "p < ", p, sep="" )

#DB
a.db<-data.frame(a.mean$year, a.mean$freq.db)
colnames(a.db)<-c("year", "x")
#breakpoint analysis
db.lm<-lm(x ~ year, data=a.db)
db.seg <- segmented(db.lm, seg.Z = ~ year, psi = list(year = c(1835)))
summary(db.seg)
db.fitted <- fitted(db.seg)
db.model <- data.frame(Year = a.db$year, Frequency = db.fitted)
db.lines <- round_any(db.seg$psi[, 2], 5, f = round)
db.text<-as.character(unname(round_any(db.seg$psi[, 2], 5, f = round)))
db.summary<-summary(db.seg)
if(db.summary$Ttable[2,4]<.001){p<-.001} else {p<-db.summary$Ttable[2,4]}
R2<-substr(db.summary$adj.r.squared, 1,4)
db.p<-paste("R = ", R2," | ", "p < ", p, sep="" )

#plot
sc.ml<-ggplot(a.ml, aes(x=year, y=x))+
  geom_point(color="black") +
  geom_line(data = ml.model, aes(x = Year, y = Frequency), size=.5) +
  theme_classic() +
  geom_vline(xintercept = ml.lines) +
  geom_text(x=ml.lines[1]+10, y=.0007, label=ml.text[1], size=3.5, check_overlap = TRUE) +
  geom_text(x=1960, y=.0007, label=ml.p, size=3.5, check_overlap = TRUE) +
  labs(x = "Year", y = "Frequency")
       #title="Dataset: SC / Method = ML")

sc.db<-ggplot(a.db, aes(x=year, y=x))+
  geom_point(color="black") +
  geom_line(data = db.model, aes(x = Year, y = Frequency), size=.5) +
  theme_classic() +
  geom_vline(xintercept = db.lines) +
  geom_text(x=db.lines[1]+10, y=.0021, label=db.text[1], size=3.5, check_overlap = TRUE) +
  geom_text(x=1960, y=.0021, label=db.p, size=3.5, check_overlap = TRUE) +
  labs(x = "Year", y = "")
       #title="Dataset: SC / Method = DB Nouns")

#combine
#sc.ml+sc.db+plot_layout(ncol = 2)+plot_annotation(title="Abundance",
#                                                  theme=theme(plot.title = element_text(hjust=0.5)))

########### HATHI 1M  ##############

#load data
setwd("~/Data/Hathi1M")
a<-read.csv("Hathi1M_YearlyData.csv")

#create combined plant/animal category
a$plant.animal<-a$noun.plant+a$noun.animal

#reduce into 5 year segments
#first divide by genre
a1<-a[a$genre == "FIC",]
a2<-a[a$genre == "NON",]
#then subset into 5 year averages
n <- 5
a1<-aggregate(a1$plant.animal, list(rep(1:(nrow(a1) %/% n + 1), each = n, len = nrow(a1))), mean)[-1]
a1$year<-seq(from=1805, to=2000, by=5)
a2<-aggregate(a2$plant.animal, list(rep(1:(nrow(a2) %/% n + 1), each = n, len = nrow(a2))), mean)[-1]
a2$year<-seq(from=1805, to=2000, by=5)
a1$genre<-c("FIC")
a2$genre<-c("NON")

#####    FIC   ######

#breakpoint analysis
fic.lm<-lm(x ~ year, data=a1)
fic.seg <- segmented(fic.lm, 
                    seg.Z = ~ year, 
                    psi = list(year = c(1835)))
fic.fitted <- fitted(fic.seg)
fic.model <- data.frame(Year = a1$year, Frequency = fic.fitted)
fic.lines <- round_any(fic.seg$psi[, 2], 5, f = round)
fic.text<-as.character(unname(round_any(fic.seg$psi[, 2], 5, f = round)))
fic.summary<-summary(fic.seg)
if(fic.summary$Ttable[2,4]<.001){p<-.001} else {p<-substr(fic.summary$Ttable[2,4], 1,5)}
R2<-substr(fic.summary$adj.r.squared, 1,4)
fic.p<-paste("R = ", R2," | ", "p < ", p, sep="" )

hathi.fic<-ggplot(a1, aes(x=year, y=x))+
  geom_point(color="black") +
  geom_line(data = fic.model, aes(x = Year, y = Frequency), size=.5) +
  theme_classic() +
  geom_vline(xintercept = fic.lines) +
  geom_text(x=fic.lines[1]+10, y=.002, label=fic.text[1], size=3.5, check_overlap = TRUE) +
  geom_text(x=1960, y=.002, label=fic.p, size=3.5, check_overlap = TRUE) +
  labs(x = "", y = "Frequency",
       title="FIC")

#####   NON   #######
#breakpoint analysis
non.lm<-lm(x ~ year, data=a2)
non.seg <- segmented(non.lm, 
                     seg.Z = ~ year, 
                     psi = list(year = c(1945)))
non.fitted <- fitted(non.seg)
non.model <- data.frame(Year = a2$year, Frequency = non.fitted)
non.lines <- round_any(non.seg$psi[, 2], 5, f = round)
non.text<-as.character(unname(round_any(non.seg$psi[, 2], 5, f = round)))
non.summary<-summary(non.seg)
if(non.summary$Ttable[2,4]<.001){p<-.001} else {p<-substr(non.summary$Ttable[2,4], 1,5)}
R2<-substr(non.summary$adj.r.squared, 1,4)
non.p<-paste("R = ", R2," | ", "p < ", p, sep="" )

hathi.non<-ggplot(a2, aes(x=year, y=x))+
  geom_point(color="black") +
  geom_line(data = non.model, aes(x = Year, y = Frequency), size=.5) +
  theme_classic() +
  geom_vline(xintercept = non.lines) +
  geom_text(x=non.lines[1]+10, y=.0028, label=non.text[1], size=3.5, check_overlap = TRUE) +
  geom_text(x=1840, y=.0028, label=non.p, size=3.5, check_overlap = TRUE) +
  labs(x = "", y = "", 
       title="NON")

#hathi.fic+hathi.non+plot_layout(ncol = 2)+plot_annotation(title="Abundance",
#                                                  theme=theme(plot.title = element_text(hjust=0.5)))

#plot with regression no breakpoints
ggplot(a1, aes(x=year, y=x))+
  geom_point(color="black") +
  theme_classic() +
  geom_smooth(method=lm, formula = y ~ poly(x,2), color="black", se=F, size=.5)+
  labs(x = "Year", y = "Frequency", title = "Frequency of Plants and Animals in English Prose, 1800-2000",
       subtitle="N=1,671,370", caption="Source: Hathi1M")

####### FIG. 4 - ABUNDANCE ########
pg.ml+pg.db+pg.fic.ml+pg.fic.db+sc.ml+sc.db+hathi.fic+hathi.non+plot_layout(ncol=2)+
  plot_annotation(title="Abundance", theme=theme(plot.title = element_text(hjust=0.5)))+
  plot_annotation(tag_levels = list(c("PG", "", "PG - FIC","", "SC","", "HATHI1M","")))


########## RICHNESS ##########

###### PG #######

setwd("~/Research/biodiversity")
a<-read.csv("PG_Results.csv")

#remove pre-1750 texts
a<-a[a$year > 1699,]

#transform into 5 year intervals
n <- 5
#save final digit as vector
five.year<-as.numeric(substr(a$year, 4, 4))
#round down to nearest 5 and append to date
five.year<-round_any(five.year, n, f = floor)
#replace column
a$year2<-as.character(a$year)
substring(a$year2,4,4)<-as.character(five.year)
a$year2<-as.numeric(a$year2)

#bootstrap 5-year means
a.mean<-NULL
for (i in 1:nlevels(factor(as.character(a$year2)))){
  print(i)
  sub<-a[a$year2 == levels(factor(as.character(a$year2)))[i],]
  mean.df<-NULL
  for (j in 1:1000){
    sub2<-sub[sample(nrow(sub), nrow(sub), replace = T),]
    sub3<-t(data.frame(colMeans(sub2[,3:12])))
    mean.df<-rbind(mean.df, sub3)
  }
  mean.df<-data.frame(mean.df)
  mean.df$year<-as.numeric(levels(factor(as.character(a$year2)))[i])
  a.mean<-rbind(a.mean, t(data.frame(colMeans(mean.df))))
}
a.mean<-data.frame(a.mean)

#ML
a.ml<-data.frame(a.mean$year, a.mean$ttr.ml)
colnames(a.ml)<-c("year", "x")
#breakpoint analysis
ml.lm<-lm(x ~ year, data=a.ml)
ml.seg <- segmented(ml.lm, seg.Z = ~ year, psi = list(year = c(1835)))
summary(ml.seg)
ml.fitted <- fitted(ml.seg)
ml.model <- data.frame(Year = a.ml$year, Frequency = ml.fitted)
ml.lines <- round_any(ml.seg$psi[, 2], 5, f = round)
ml.text<-as.character(unname(round_any(ml.seg$psi[, 2], 5, f = round)))
ml.summary<-summary(ml.seg)
if(ml.summary$Ttable[2,4]<.001){p<-.001} else {p<-substr(ml.summary$Ttable[2,4],1,5)}
R2<-substr(ml.summary$adj.r.squared, 1,4)
ml.p<-paste("R = ", R2," | ", "p < ", p, sep="" )

#DB
a.db<-data.frame(a.mean$year, a.mean$ttr.db)
colnames(a.db)<-c("year", "x")
#breakpoint analysis
db.lm<-lm(x ~ year, data=a.db)
db.seg <- segmented(db.lm, seg.Z = ~ year, psi = list(year = c(1835)))
summary(db.seg)
db.fitted <- fitted(db.seg)
db.model <- data.frame(Year = a.db$year, Frequency = db.fitted)
db.lines <- round_any(db.seg$psi[, 2], 5, f = round)
db.text<-as.character(unname(round_any(db.seg$psi[, 2], 5, f = round)))
db.summary<-summary(db.seg)
if(db.summary$Ttable[2,4]<.001){p<-.001} else {p<-substr(db.summary$Ttable[2,4], 1,5)}
R2<-substr(db.summary$adj.r.squared, 1,4)
db.p<-paste("R = ", R2," | ", "p < ", p, sep="" )

#plot
pg.ml<-ggplot(a.ml, aes(x=year, y=x))+
  geom_point(color="black") +
  geom_line(data = ml.model, aes(x = Year, y = Frequency), size=.5) +
  theme_classic() +
  geom_vline(xintercept = ml.lines) +
  geom_text(x=ml.lines[1]+12, y=.25, label=ml.text[1], size=3.5, check_overlap = TRUE) +
  geom_text(x=1920, y=.25, label=ml.p, size=3.5, check_overlap = TRUE) +
  labs(x = "Year", y = "Types per 1K Words", title="Method = ML")

pg.db<-ggplot(a.db, aes(x=year, y=x))+
  geom_point(color="black") +
  geom_line(data = db.model, aes(x = Year, y = Frequency), size=.5) +
  theme_classic() +
  geom_vline(xintercept = db.lines) +
  geom_text(x=db.lines[1]+12, y=1.75, label=db.text[1], size=3.5, check_overlap = TRUE) +
  geom_text(x=1920, y=1.75, label=db.p, size=3.5, check_overlap = TRUE) +
  labs(x = "Year", y = "", title="Method = DB")

####### PG FIC #########
setwd("~/Research/biodiversity")
a<-read.csv("PG_Results.csv")
#remove pre-1750 texts
a<-a[a$year > 1699,]
#transform into 5 year intervals
n <- 5
#save final digit as vector
five.year<-as.numeric(substr(a$year, 4, 4))
#round down to nearest 5 and append to date
five.year<-round_any(five.year, n, f = floor)
#replace column
a$year2<-as.character(a$year)
substring(a$year2,4,4)<-as.character(five.year)
a$year2<-as.numeric(a$year2)
pg.meta<-read.csv("PG_metadata_bio.csv")
pg.meta<-pg.meta[pg.meta$id %in% gsub(".txt","",a$filename),]

#keep only texts labeled "fiction" in "subjects" column
pg.fic<-pg.meta[grep("fiction", pg.meta$subjects, ignore.case = T),]
a.fic<-a[gsub(".txt","",a$filename) %in% pg.fic$id,]

#rerun bootstrapping

#bootstrap 5-year means
a.mean<-NULL
for (i in 1:nlevels(factor(as.character(a.fic$year2)))){
  print(i)
  sub<-a.fic[a.fic$year2 == levels(factor(as.character(a.fic$year2)))[i],]
  mean.df<-NULL
  for (j in 1:1000){
    sub2<-sub[sample(nrow(sub), nrow(sub), replace = T),]
    sub3<-t(data.frame(colMeans(sub2[,3:12])))
    mean.df<-rbind(mean.df, sub3)
  }
  mean.df<-data.frame(mean.df)
  mean.df$year<-as.numeric(levels(factor(as.character(a.fic$year2)))[i])
  a.mean<-rbind(a.mean, t(data.frame(colMeans(mean.df))))
}
a.mean<-data.frame(a.mean)

#remove outlier year
a.mean<-a.mean[-which(a.mean$freq.ml>(2*sd(a.mean$freq.ml))+mean(a.mean$freq.ml)),]

#ML
a.ml<-data.frame(a.mean$year, a.mean$ttr.ml)
colnames(a.ml)<-c("year", "x")
#breakpoint analysis
ml.lm<-lm(x ~ year, data=a.ml)
ml.seg <- segmented(ml.lm, seg.Z = ~ year, psi = list(year = c(1835)))
summary(ml.seg)
ml.fitted <- fitted(ml.seg)
ml.model <- data.frame(Year = a.ml$year, Frequency = ml.fitted)
ml.lines <- round_any(ml.seg$psi[, 2], 5, f = round)
ml.text<-as.character(unname(round_any(ml.seg$psi[, 2], 5, f = round)))
ml.summary<-summary(ml.seg)
if(ml.summary$Ttable[2,4]<.001){p<-.001} else {p<-substr(ml.summary$Ttable[2,4],1,5)}
R2<-substr(ml.summary$adj.r.squared, 1,4)
ml.p<-paste("R = ", R2," | ", "p < ", p, sep="" )

#DB
a.db<-data.frame(a.mean$year, a.mean$ttr.db)
colnames(a.db)<-c("year", "x")
#breakpoint analysis
db.lm<-lm(x ~ year, data=a.db)
db.seg <- segmented(db.lm, seg.Z = ~ year, psi = list(year = c(1835)))
summary(db.seg)
db.fitted <- fitted(db.seg)
db.model <- data.frame(Year = a.db$year, Frequency = db.fitted)
db.lines <- round_any(db.seg$psi[, 2], 5, f = round)
db.text<-as.character(unname(round_any(db.seg$psi[, 2], 5, f = round)))
db.summary<-summary(db.seg)
if(db.summary$Ttable[2,4]<.001){p<-.001} else {p<-substr(db.summary$Ttable[2,4], 1,5)}
R2<-substr(db.summary$adj.r.squared, 1,4)
db.p<-paste("R = ", R2," | ", "p < ", p, sep="" )

#plot
pg.fic.ml<-ggplot(a.ml, aes(x=year, y=x))+
  geom_point(color="black") +
  geom_line(data = ml.model, aes(x = Year, y = Frequency), size=.5) +
  theme_classic() +
  geom_vline(xintercept = ml.lines) +
  geom_text(x=ml.lines[1]+12, y=.10, label=ml.text[1], size=3.5, check_overlap = TRUE) +
  geom_text(x=1880, y=.10, label=ml.p, size=3.5, check_overlap = TRUE) +
  labs(x = "Year", y = "Types per 1K Words")

pg.fic.db<-ggplot(a.db, aes(x=year, y=x))+
  geom_point(color="black") +
  geom_line(data = db.model, aes(x = Year, y = Frequency), size=.5) +
  theme_classic() +
  geom_vline(xintercept = db.lines) +
  geom_text(x=db.lines[1]+12, y=1.0, label=db.text[1], size=3.5, check_overlap = TRUE) +
  geom_text(x=1840, y=1.0, label=db.p, size=3.5, check_overlap = TRUE) +
  labs(x = "Year", y = "")


#######  SC  ########
setwd("~/Research/biodiversity")
a<-read.csv("Stan_Chicago_Results.csv")

#transform into 5 year intervals
n <- 5
#save final digit as vector
five.year<-as.numeric(substr(a$year, 4, 4))
#round down to nearest 5 and append to date
five.year<-round_any(five.year, n, f = floor)
#replace column
a$year2<-as.character(a$year)
substring(a$year2,4,4)<-as.character(five.year)
a$year2<-as.numeric(a$year2)

#get 5 year average for each measure
a.mean<-aggregate(a[,3:12],by=list(factor(a$year2)), mean)
colnames(a.mean)[1]<-c("year")
a.mean$year<-as.numeric(as.character(a.mean$year))

#ML
a.ml<-data.frame(a.mean$year, a.mean$ttr.ml)
colnames(a.ml)<-c("year", "x")
#breakpoint analysis
ml.lm<-lm(x ~ year, data=a.ml)
ml.seg <- segmented(ml.lm, seg.Z = ~ year, psi = list(year = c(1835)))
summary(ml.seg)
ml.fitted <- fitted(ml.seg)
ml.model <- data.frame(Year = a.ml$year, Frequency = ml.fitted)
ml.lines <- round_any(ml.seg$psi[, 2], 5, f = round)
ml.text<-as.character(unname(round_any(ml.seg$psi[, 2], 5, f = round)))
ml.summary<-summary(ml.seg)
if(ml.summary$Ttable[2,4]<.001){p<-.001} else {p<-substr(ml.summary$Ttable[2,4],1,5)}
R2<-substr(ml.summary$adj.r.squared, 1,4)
ml.p<-paste("R = ", R2," | ", "p < ", p, sep="" )

#DB
a.db<-data.frame(a.mean$year, a.mean$ttr.db)
colnames(a.db)<-c("year", "x")
#breakpoint analysis
db.lm<-lm(x ~ year, data=a.db)
db.seg <- segmented(db.lm, seg.Z = ~ year, psi = list(year = c(1835)))
summary(db.seg)
db.fitted <- fitted(db.seg)
db.model <- data.frame(Year = a.db$year, Frequency = db.fitted)
db.lines <- round_any(db.seg$psi[, 2], 5, f = round)
db.text<-as.character(unname(round_any(db.seg$psi[, 2], 5, f = round)))
db.summary<-summary(db.seg)
if(db.summary$Ttable[2,4]<.001){p<-.001} else {p<-substr(db.summary$Ttable[2,4], 1,5)}
R2<-substr(db.summary$adj.r.squared, 1,4)
db.p<-paste("R = ", R2," | ", "p < ", p, sep="" )

#plot
sc.ml<-ggplot(a.ml, aes(x=year, y=x))+
  geom_point(color="black") +
  geom_line(data = ml.model, aes(x = Year, y = Frequency), size=.5) +
  theme_classic() +
  geom_vline(xintercept = ml.lines) +
  geom_text(x=ml.lines[1]+12, y=.4, label=ml.text[1], size=3.5, check_overlap = TRUE) +
  geom_text(x=1960, y=.4, label=ml.p, size=3.5, check_overlap = TRUE) +
  labs(x = "Year", y = "Types per 1K Words")

sc.db<-ggplot(a.db, aes(x=year, y=x))+
  geom_point(color="black") +
  geom_line(data = db.model, aes(x = Year, y = Frequency), size=.5) +
  theme_classic() +
  geom_vline(xintercept = db.lines) +
  geom_text(x=db.lines[1]+12, y=1.6, label=db.text[1], size=3.5, check_overlap = TRUE) +
  geom_text(x=1960, y=1.6, label=db.p, size=3.5, check_overlap = TRUE) +
  labs(x = "Year", y = "")

#######  FIG. 5 - RICHNESS  ########
pg.ml+pg.db+pg.fic.ml+pg.fic.db+sc.ml+sc.db+plot_layout(ncol=2)+
  plot_annotation(title="Richness", theme=theme(plot.title = element_text(hjust=0.5)))+
  plot_annotation(tag_levels = list(c("PG", "", "PG - FIC","", "SC","")))


########## DIVERSITY ############

###### PG #######

setwd("~/Research/biodiversity")
a<-read.csv("PG_Results.csv")

#remove pre-1750 texts
a<-a[a$year > 1699,]

#transform into 5 year intervals
n <- 5
#save final digit as vector
five.year<-as.numeric(substr(a$year, 4, 4))
#round down to nearest 5 and append to date
five.year<-round_any(five.year, n, f = floor)
#replace column
a$year2<-as.character(a$year)
substring(a$year2,4,4)<-as.character(five.year)
a$year2<-as.numeric(a$year2)

#bootstrap 5-year means
a.mean<-NULL
for (i in 1:nlevels(factor(as.character(a$year2)))){
  print(i)
  sub<-a[a$year2 == levels(factor(as.character(a$year2)))[i],]
  mean.df<-NULL
  for (j in 1:1000){
    sub2<-sub[sample(nrow(sub), nrow(sub), replace = T),]
    sub3<-t(data.frame(colMeans(sub2[,3:12])))
    mean.df<-rbind(mean.df, sub3)
  }
  mean.df<-data.frame(mean.df)
  mean.df$year<-as.numeric(levels(factor(as.character(a$year2)))[i])
  a.mean<-rbind(a.mean, t(data.frame(colMeans(mean.df))))
}
a.mean<-data.frame(a.mean)

#ML
a.ml<-data.frame(a.mean$year, a.mean$entropy.ml)
colnames(a.ml)<-c("year", "x")
#breakpoint analysis
ml.lm<-lm(x ~ year, data=a.ml)
ml.seg <- segmented(ml.lm, seg.Z = ~ year, psi = list(year = c(1835)))
summary(ml.seg)
ml.fitted <- fitted(ml.seg)
ml.model <- data.frame(Year = a.ml$year, Frequency = ml.fitted)
ml.lines <- round_any(ml.seg$psi[, 2], 5, f = round)
ml.text<-as.character(unname(round_any(ml.seg$psi[, 2], 5, f = round)))
ml.summary<-summary(ml.seg)
if(ml.summary$Ttable[2,4]<.001){p<-.001} else {p<-substr(ml.summary$Ttable[2,4],1,5)}
R2<-substr(ml.summary$adj.r.squared, 1,4)
ml.p<-paste("R = ", R2," | ", "p < ", p, sep="" )

#DB
a.db<-data.frame(a.mean$year, a.mean$entropy.db)
colnames(a.db)<-c("year", "x")
#breakpoint analysis
db.lm<-lm(x ~ year, data=a.db)
db.seg <- segmented(db.lm, seg.Z = ~ year, psi = list(year = c(1835)))
summary(db.seg)
db.fitted <- fitted(db.seg)
db.model <- data.frame(Year = a.db$year, Frequency = db.fitted)
db.lines <- round_any(db.seg$psi[, 2], 5, f = round)
db.text<-as.character(unname(round_any(db.seg$psi[, 2], 5, f = round)))
db.summary<-summary(db.seg)
if(db.summary$Ttable[2,4]<.001){p<-.001} else {p<-substr(db.summary$Ttable[2,4], 1,5)}
R2<-substr(db.summary$adj.r.squared, 1,4)
db.p<-paste("R = ", R2," | ", "p < ", p, sep="" )

#plot
pg.ml<-ggplot(a.ml, aes(x=year, y=x))+
  geom_point(color="black") +
  geom_line(data = ml.model, aes(x = Year, y = Frequency), size=.5) +
  theme_classic() +
  geom_vline(xintercept = ml.lines) +
  geom_text(x=ml.lines[1]+12, y=.03, label=ml.text[1], size=3.5, check_overlap = TRUE) +
  geom_text(x=1920, y=.03, label=ml.p, size=3.5, check_overlap = TRUE) +
  labs(x = "Year", y = "Entropy", title="Method = ML")

pg.db<-ggplot(a.db, aes(x=year, y=x))+
  geom_point(color="black") +
  geom_line(data = db.model, aes(x = Year, y = Frequency), size=.5) +
  theme_classic() +
  geom_vline(xintercept = db.lines) +
  geom_text(x=db.lines[1]+12, y=0.4, label=db.text[1], size=3.5, check_overlap = TRUE) +
  geom_text(x=1920, y=0.4, label=db.p, size=3.5, check_overlap = TRUE) +
  labs(x = "Year", y = "", title="Method = DB")

#######  PG - FIC  ##########
setwd("~/Research/biodiversity")
a<-read.csv("PG_Results.csv")
#remove pre-1750 texts
a<-a[a$year > 1699,]
#transform into 5 year intervals
n <- 5
#save final digit as vector
five.year<-as.numeric(substr(a$year, 4, 4))
#round down to nearest 5 and append to date
five.year<-round_any(five.year, n, f = floor)
#replace column
a$year2<-as.character(a$year)
substring(a$year2,4,4)<-as.character(five.year)
a$year2<-as.numeric(a$year2)
pg.meta<-read.csv("PG_metadata_bio.csv")
pg.meta<-pg.meta[pg.meta$id %in% gsub(".txt","",a$filename),]

#keep only texts labeled "fiction" in "subjects" column
pg.fic<-pg.meta[grep("fiction", pg.meta$subjects, ignore.case = T),]
a.fic<-a[gsub(".txt","",a$filename) %in% pg.fic$id,]

#rerun bootstrapping

#bootstrap 5-year means
a.mean<-NULL
for (i in 1:nlevels(factor(as.character(a.fic$year2)))){
  print(i)
  sub<-a.fic[a.fic$year2 == levels(factor(as.character(a.fic$year2)))[i],]
  mean.df<-NULL
  for (j in 1:1000){
    sub2<-sub[sample(nrow(sub), nrow(sub), replace = T),]
    sub3<-t(data.frame(colMeans(sub2[,3:12])))
    mean.df<-rbind(mean.df, sub3)
  }
  mean.df<-data.frame(mean.df)
  mean.df$year<-as.numeric(levels(factor(as.character(a.fic$year2)))[i])
  a.mean<-rbind(a.mean, t(data.frame(colMeans(mean.df))))
}
a.mean<-data.frame(a.mean)

#remove outlier year
a.mean<-a.mean[-which(a.mean$freq.ml>(2*sd(a.mean$freq.ml))+mean(a.mean$freq.ml)),]

#ML
a.ml<-data.frame(a.mean$year, a.mean$ttr.ml)
colnames(a.ml)<-c("year", "x")
#breakpoint analysis
ml.lm<-lm(x ~ year, data=a.ml)
ml.seg <- segmented(ml.lm, seg.Z = ~ year, psi = list(year = c(1835)))
summary(ml.seg)
ml.fitted <- fitted(ml.seg)
ml.model <- data.frame(Year = a.ml$year, Frequency = ml.fitted)
ml.lines <- round_any(ml.seg$psi[, 2], 5, f = round)
ml.text<-as.character(unname(round_any(ml.seg$psi[, 2], 5, f = round)))
ml.summary<-summary(ml.seg)
if(ml.summary$Ttable[2,4]<.001){p<-.001} else {p<-substr(ml.summary$Ttable[2,4],1,5)}
R2<-substr(ml.summary$adj.r.squared, 1,4)
ml.p<-paste("R = ", R2," | ", "p < ", p, sep="" )

#DB
a.db<-data.frame(a.mean$year, a.mean$ttr.db)
colnames(a.db)<-c("year", "x")
#breakpoint analysis
db.lm<-lm(x ~ year, data=a.db)
db.seg <- segmented(db.lm, seg.Z = ~ year, psi = list(year = c(1835)))
summary(db.seg)
db.fitted <- fitted(db.seg)
db.model <- data.frame(Year = a.db$year, Frequency = db.fitted)
db.lines <- round_any(db.seg$psi[, 2], 5, f = round)
db.text<-as.character(unname(round_any(db.seg$psi[, 2], 5, f = round)))
db.summary<-summary(db.seg)
if(db.summary$Ttable[2,4]<.001){p<-.001} else {p<-substr(db.summary$Ttable[2,4], 1,5)}
R2<-substr(db.summary$adj.r.squared, 1,4)
db.p<-paste("R = ", R2," | ", "p < ", p, sep="" )

#plot
pg.fic.ml<-ggplot(a.ml, aes(x=year, y=x))+
  geom_point(color="black") +
  geom_line(data = ml.model, aes(x = Year, y = Frequency), size=.5) +
  theme_classic() +
  geom_vline(xintercept = ml.lines) +
  geom_text(x=ml.lines[1]+12, y=.03, label=ml.text[1], size=3.5, check_overlap = TRUE) +
  geom_text(x=1840, y=.03, label=ml.p, size=3.5, check_overlap = TRUE) +
  labs(x = "Year", y = "Entropy")

pg.fic.db<-ggplot(a.db, aes(x=year, y=x))+
  geom_point(color="black") +
  geom_line(data = db.model, aes(x = Year, y = Frequency), size=.5) +
  theme_classic() +
  geom_vline(xintercept = db.lines) +
  geom_text(x=db.lines[1]+18, y=1, label=db.text[1], size=3.5, check_overlap = TRUE) +
  geom_text(x=1850, y=1, label=db.p, size=3.5, check_overlap = TRUE) +
  labs(x = "Year", y = "")

########  SC  #########
setwd("~/Research/biodiversity")
a<-read.csv("Stan_Chicago_Results.csv")

#transform into 5 year intervals
n <- 5
#save final digit as vector
five.year<-as.numeric(substr(a$year, 4, 4))
#round down to nearest 5 and append to date
five.year<-round_any(five.year, n, f = floor)
#replace column
a$year2<-as.character(a$year)
substring(a$year2,4,4)<-as.character(five.year)
a$year2<-as.numeric(a$year2)

#get 5 year average for each measure
a.mean<-aggregate(a[,3:12],by=list(factor(a$year2)), mean)
colnames(a.mean)[1]<-c("year")
a.mean$year<-as.numeric(as.character(a.mean$year))

#remove 0 year == 1 observation
a<-a[-which(a$year2 == 1780),]

#ML
a.ml<-data.frame(a.mean$year, a.mean$entropy.ml)
colnames(a.ml)<-c("year", "x")
#breakpoint analysis
ml.lm<-lm(x ~ year, data=a.ml)
ml.seg <- segmented(ml.lm, seg.Z = ~ year, psi = list(year = c(1835)))
summary(ml.seg)
ml.fitted <- fitted(ml.seg)
ml.model <- data.frame(Year = a.ml$year, Frequency = ml.fitted)
ml.lines <- round_any(ml.seg$psi[, 2], 5, f = round)
ml.text<-as.character(unname(round_any(ml.seg$psi[, 2], 5, f = round)))
ml.summary<-summary(ml.seg)
if(ml.summary$Ttable[2,4]<.001){p<-.001} else {p<-substr(ml.summary$Ttable[2,4],1,5)}
R2<-substr(ml.summary$adj.r.squared, 1,4)
ml.p<-paste("R = ", R2," | ", "p < ", p, sep="" )

#DB
a.db<-data.frame(a.mean$year, a.mean$entropy.db)
colnames(a.db)<-c("year", "x")
#breakpoint analysis
db.lm<-lm(x ~ year, data=a.db)
db.seg <- segmented(db.lm, seg.Z = ~ year, psi = list(year = c(1835)))
summary(db.seg)
db.fitted <- fitted(db.seg)
db.model <- data.frame(Year = a.db$year, Frequency = db.fitted)
db.lines <- round_any(db.seg$psi[, 2], 5, f = round)
db.text<-as.character(unname(round_any(db.seg$psi[, 2], 5, f = round)))
db.summary<-summary(db.seg)
if(db.summary$Ttable[2,4]<.001){p<-.001} else {p<-substr(db.summary$Ttable[2,4], 1,5)}
R2<-substr(db.summary$adj.r.squared, 1,4)
db.p<-paste("R = ", R2," | ", "p < ", p, sep="" )

#plot
sc.ml<-ggplot(a.ml, aes(x=year, y=x))+
  geom_point(color="black") +
  geom_line(data = ml.model, aes(x = Year, y = Frequency), size=.5) +
  theme_classic() +
  geom_vline(xintercept = ml.lines) +
  geom_text(x=ml.lines[1]+12, y=.25, label=ml.text[1], size=3.5, check_overlap = TRUE) +
  geom_text(x=1960, y=.25, label=ml.p, size=3.5, check_overlap = TRUE) +
  labs(x = "Year", y = "Entropy")

sc.db<-ggplot(a.db, aes(x=year, y=x))+
  geom_point(color="black") +
  geom_line(data = db.model, aes(x = Year, y = Frequency), size=.5) +
  theme_classic() +
  geom_vline(xintercept = db.lines) +
  geom_text(x=db.lines[1]+12, y=0.73, label=db.text[1], size=3.5, check_overlap = TRUE) +
  geom_text(x=1960, y=0.73, label=db.p, size=3.5, check_overlap = TRUE) +
  labs(x = "Year", y = "")

#######  FIG. 6 - DIVERSITY  ########
pg.ml+pg.db+pg.fic.ml+pg.fic.db+sc.ml+sc.db+plot_layout(ncol=2)+
  plot_annotation(title="Diversity", theme=theme(plot.title = element_text(hjust=0.5)))+
  plot_annotation(tag_levels = list(c("PG", "", "PG - FIC","", "SC","")))


##### Calculate increased rate of DB method to ML  #######
setwd("~/Research/biodiversity")
a<-read.csv("Stan_Chicago_Results.csv")

#transform into 5 year intervals
n <- 5
#save final digit as vector
five.year<-as.numeric(substr(a$year, 4, 4))
#round down to nearest 5 and append to date
five.year<-round_any(five.year, n, f = floor)
#replace column
a$year2<-as.character(a$year)
substring(a$year2,4,4)<-as.character(five.year)
a$year2<-as.numeric(a$year2)

#get 5 year average for each measure
a.mean<-aggregate(a[,3:12],by=list(factor(a$year2)), mean)
colnames(a.mean)[1]<-c("year")
a.mean$year<-as.numeric(as.character(a.mean$year))

#remove 0 year == 1 observation
a<-a[-which(a$year2 == 1780),]

mean(a$freq.db)/mean(a$freq.ml)

####### Correlation of Abundance with Richness #######

ggplot(a, aes(x=freq.ml, y=ttr.ml))+
  geom_point(color="black") +
  #geom_line(data = db.model, aes(x = Year, y = Frequency), size=.5) +
  theme_classic() +
  #geom_vline(xintercept = db.lines) +
  #geom_text(x=db.lines[1]+12, y=0.73, label=db.text[1], size=3.5, check_overlap = TRUE) +
  #geom_text(x=1960, y=0.73, label=db.p, size=3.5, check_overlap = TRUE) +
  labs(x = "Abundance", y = "Richness", title="Correlation of Abundance and Richness (r = 0.89)", caption = "Dataset = SC")
  #geom_smooth(method = "lm")

cor.test(a$freq.ml, a$ttr.ml)

#######  Animal v. plant   #######
library(zoo)

#Hathi
setwd("~/Data/Hathi1M")
a<-read.csv("Hathi1M_YearlyData.csv")
a1<-a[a$genre == "FIC",]
b<-a1[,colnames(a1) %in% c("noun.plant", "noun.animal", "year")]
#take 10-year moving average
b$noun.animal<-rollmean(b$noun.animal, 10, na.pad = T)
b$noun.plant<-rollmean(b$noun.plant, 10, na.pad = T)
b.long<-melt(b, id.vars=c("year"))
b.long$collection<-c("Hathi1M")
b.long$variable<-factor(gsub("noun.", "",b.long$variable))

#SC
setwd("~/Research/biodiversity")
a<-read.csv("Stan_Chicago_Results.csv")
#get yearly average for each measure
a.mean<-aggregate(a[,3:11],by=list(factor(a$year)), mean)
colnames(a.mean)[1]<-c("year")
a.mean$year<-as.numeric(as.character(a.mean$year))
c<-a.mean[,1:3]
#remove outlier years
cut<-mean(c$animal)+(3*sd(c$animal))
c<-c[c$animal < cut,]
c$animal<-rollmean(c$animal, 10, na.pad = T)
c$plant<-rollmean(c$plant, 10, na.pad = T)
c.long<-melt(c, id.vars=c("year"))
c.long<-c.long[c.long$year > 1799 & c.long$year < 2000,]
c.long$collection<-c("StanfordChicago")

#combine
d<-rbind(b.long, c.long)
d$collection<-factor(d$collection)

#plot altogether as facet graph
ggplot(d, aes(x=year, y=value, linetype=variable))+
  geom_line() +
  facet_wrap( ~ collection, nrow = 1) + 
  #xlab("Year") +
  #ylab("Frequency") + 
  theme_bw() +
  theme(legend.position = "bottom") + 
  theme(legend.title=element_blank()) +
  #scale_colour_discrete(labels=c("", "Plant")) +
  guides(color = "none") +
  scale_linetype_discrete(labels=c("Animals", "Plants")) +
  labs(x = "Year", y = "Frequency", title = "Frequency of Plants and Animals in English Fiction, 1800-2000")



